Classify

Prototype pipeline: Parse tradb, extract features and train/compare classifiers...

In [1]:
%matplotlib notebook
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from _plot import distplot, boxenplot, pairplot, corrplot, plot_learning_curve
from _compute import compute_features, load_hits
from _preprocess import crop_tra
import _features
In [ ]:
# MSG
config = {
    "basedir": "/mnt/c/AE-Data/msg",
    "classes": ["class_1", "class_2", "class_3"]
}
In [2]:
# Lidy
config = {
    "basedir": "/mnt/c/AE-Data/Daten_TT",
    "classes": ["Brause_6884", "Faserreibung_6865", "Sand_6943", "Sand-Reibung_7001"]
}

Dataset

Explore the dataset based on the pridb files and standard hit features

In [3]:
df_hits = load_hits(config["basedir"], config["classes"])
Hits: 100%|██████████| 6931/6931 [00:00<00:00, 110535.48it/s]
Hits: 100%|██████████| 7011/7011 [00:00<00:00, 113474.18it/s]
Hits: 100%|██████████| 6929/6929 [00:00<00:00, 119391.72it/s]
Hits: 100%|██████████| 6988/6988 [00:00<00:00, 119710.00it/s]
In [4]:
plt.figure(figsize=(5, 2), tight_layout=True)
sns.countplot(y="class", data=df_hits)
Out[4]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fdd6c4c7f60>
In [5]:
boxenplot(df_hits, "class", cols=4, logscale=False)
Out[5]:
<seaborn.axisgrid.FacetGrid at 0x7fdd6c496e10>

Compute transient features

In [6]:
def extract(tra):
    """Crop a transient record to a fixed length and build the complete
    feature dict (standard + hit + spectral features merged together)."""
    n_samples = 8192
    cropped = crop_tra(tra, n_samples, offset=None)

    features = {}
    features.update(_features.standard(cropped))
    features.update(_features.hit(cropped))
    features.update(
        _features.spectral(
            cropped,
            f_min=10e3,
            f_max=500e3,
            n_bands=128,
            log_freq=False,
            n_mels=64,
            n_mels_max=None,
        )
    )
    return features

Try to imitate VisualClass feature extractor...

In [ ]:
#def extract(tra):
#    return _features.visual_class(tra, samples=2048, f_min=80, f_max=550e3)
In [7]:
%%time
df = compute_features(config["basedir"], config["classes"], extract, multiprocess=True, processes=7, chunksize=10)

print(f"Samples:  {df.shape[0]}")
print(f"Features: {df.shape[1] - 1}")
Brause_6884: 100%|██████████| 6929/6929 [00:07<00:00, 923.87it/s] 
Faserreibung_6865: 100%|██████████| 7011/7011 [00:03<00:00, 1951.76it/s]
Sand_6943: 100%|██████████| 6929/6929 [00:04<00:00, 1693.14it/s]
Sand-Reibung_7001: 100%|██████████| 6988/6988 [00:03<00:00, 1968.11it/s]
Samples:  27857
Features: 223
CPU times: user 6.47 s, sys: 2.31 s, total: 8.78 s
Wall time: 20.8 s

Feature visualization

In [8]:
distplot(df, "class")
Out[8]:
<seaborn.axisgrid.FacetGrid at 0x7fdd53969860>
In [9]:
boxenplot(df, "class")
Out[9]:
<seaborn.axisgrid.FacetGrid at 0x7fdd4befce80>
In [ ]:
# pairplot(df.sample(100), "class")
# pairplot(df, "class")

Feature correlation

In [10]:
corrplot(df)
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fdd09f092e8>

Fisher ratios

Rough estimate, considering only the traces of the within/between scatter matrices

In [12]:
def fisher_ratios(dataframe: pd.DataFrame, class_column: str) -> pd.Series:
    """Rough per-feature Fisher discriminant ratio.

    Between-class variance of the per-class means divided by the summed
    within-class variance (traces of the scatter matrices only, no
    cross-feature terms).

    Only numeric columns are scored; the class column itself and any other
    non-numeric column are excluded. This also keeps the function working on
    pandas >= 2.0, where DataFrame.mean()/groupby().mean() raise a TypeError
    when non-numeric columns are present.

    Parameters:
        dataframe: samples x (features + class) frame.
        class_column: name of the label column to group by.

    Returns:
        Series indexed by feature name; higher = more discriminative.
    """
    # Restrict to numeric feature columns before any aggregation.
    features = dataframe.drop(columns=class_column).select_dtypes(include="number")
    # Group once and reuse for both scatter estimates.
    grouped = features.groupby(dataframe[class_column])
    var_between = ((grouped.mean() - features.mean()) ** 2).sum()
    var_within = grouped.var().sum()
    return var_between / var_within

fr = fisher_ratios(df, "class")
fr.sort_values(ascending=False).head(10)
Out[12]:
MFCC_3                 0.455799
MFCC_1                 0.288136
PP_36.80-40.62kHz      0.266522
ZCR                    0.260804
PP_32.97-36.80kHz      0.256241
PP_40.62-44.45kHz      0.229310
PP_44.45-48.28kHz      0.224210
PP_29.14-32.97kHz      0.221081
PP_25.31-29.14kHz      0.191012
PP_258.83-262.66kHz    0.182344
dtype: float64
In [14]:
plt.figure(tight_layout=True, figsize=(8, 0.15 * len(fr)))
fr[::-1].plot.barh()
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fdc292945f8>
In [13]:
plt.figure(tight_layout=True, figsize=(9.6, 3))
fr.plot(marker=".", linestyle="None")
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fdd0813e630>

Filter by fisher ratio

TODO: Write an sklearn-compatible feature-selection transformer (so it can be dropped into the pipeline below)

In [ ]:
# threshold = 0.1
# df_filtered = df[[*fr[fr > threshold].index, "class"]]

Prepare feature matrix (X) and labels (y) - train/test split

In [15]:
# Feature matrix: every column except the label.
dfX = df.drop(columns="class")
# Encode the categorical class labels as integer codes for sklearn.
dfy = df["class"].cat.codes

X = dfX.to_numpy()
y = dfy.to_numpy()

# train/test split (fixed random_state so the split is reproducible)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

Train model

In [16]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, RFE
from sklearn.decomposition import PCA, FastICA
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

# feature scaling
scaler = MinMaxScaler()

# feature selection
n_features = 100
selector = None
#selector = SelectKBest(k=n_features)
#selector = RFE(RandomForestClassifier(n_jobs=-1), n_features_to_select=n_features, step=10)
#selector = PCA(n_components=n_features)
#selector = FastICA(n_components=n_features)

# classifier
#clf = GaussianNB()
clf = LinearSVC(dual=False)  # dual=False if n_samples > n_features
#clf = SVC(kernel="linear", probability=True)
#clf = SVC(kernel="poly", probability=True)
#clf = GradientBoostingClassifier()
#clf = RandomForestClassifier(n_estimators=10, n_jobs=-1)
#clf = ExtraTreesClassifier(n_estimators=100, n_jobs=-1)
#clf = LinearDiscriminantAnalysis()
#clf = QuadraticDiscriminantAnalysis()

# build pipeline
pipeline = make_pipeline(
    *[step for step in (scaler, selector, clf) if step is not None]  # ignore None steps
)
print(pipeline)

# train
%time pipeline.fit(X_train, y_train)

# scores
scores = classification_report(
    y_test,
    pipeline.predict(X_test),
    target_names=df["class"].cat.categories
)
print(scores)
Pipeline(memory=None,
         steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('linearsvc',
                 LinearSVC(C=1.0, class_weight=None, dual=False,
                           fit_intercept=True, intercept_scaling=1,
                           loss='squared_hinge', max_iter=1000,
                           multi_class='ovr', penalty='l2', random_state=None,
                           tol=0.0001, verbose=0))],
         verbose=False)
CPU times: user 7.22 s, sys: 0 ns, total: 7.22 s
Wall time: 7.22 s
                   precision    recall  f1-score   support

      Brause_6884       0.86      0.85      0.85      1704
Faserreibung_6865       0.79      0.74      0.77      1766
Sand-Reibung_7001       0.80      0.81      0.81      1752
        Sand_6943       0.83      0.89      0.86      1743

         accuracy                           0.82      6965
        macro avg       0.82      0.82      0.82      6965
     weighted avg       0.82      0.82      0.82      6965

Confusion matrix

In [17]:
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2 — on newer versions use ConfusionMatrixDisplay.from_estimator.
from sklearn.metrics import plot_confusion_matrix

# normalize="true" row-normalizes: each true-class row sums to 1.
class_names = df["class"].cat.categories
plot_confusion_matrix(pipeline, X_test, y_test, display_labels=class_names, cmap=plt.cm.Blues, normalize="true")
plt.tight_layout()

Cross validation

In [18]:
%%time
from sklearn.model_selection import StratifiedShuffleSplit, cross_validate

cv = StratifiedShuffleSplit(n_splits=5, test_size=0.25)
scores = cross_validate(pipeline, X, y, cv=cv, n_jobs=5, scoring=[
    "accuracy",
    "precision_macro",
    "recall_macro",
    #"roc_auc_ovo",
    #"roc_auc_ovr",
])

[f"CV {key}: {arr.mean():.2f} +/- {arr.std():0.2f}" for key, arr in scores.items()]
CPU times: user 93.8 ms, sys: 3.81 s, total: 3.91 s
Wall time: 33.4 s
Out[18]:
['CV fit_time: 28.02 +/- 0.58',
 'CV score_time: 0.04 +/- 0.00',
 'CV test_accuracy: 0.82 +/- 0.00',
 'CV test_precision_macro: 0.82 +/- 0.00',
 'CV test_recall_macro: 0.82 +/- 0.00']

Plot learning curve

In [19]:
plot_learning_curve(pipeline, X, y, cv=cv, scoring="accuracy")

Feature importances

Get relevance of features from classifier or feature selection processor

In [20]:
def feature_importances(classifier, names) -> pd.Series:
    """Extract a per-feature relevance score from a fitted estimator or pipeline.

    Walks the pipeline steps (or the single estimator) and returns the first
    score-like attribute found, as a Series indexed by `names`:
      - `scores_`              (e.g. SelectKBest)
      - `ranking_`             (e.g. RFE; inverted so higher = better)
      - `feature_importances_` (e.g. tree ensembles)
      - `coef_`                (linear models / SVM; summed absolute weights
                                across classes)

    Returns None if no step exposes any of these attributes.
    """
    steps = [classifier]
    if hasattr(classifier, "named_steps"):  # it's a pipeline
        # BUG FIX: previously inspected the global `pipeline` instead of the
        # `classifier` argument, ignoring whatever object was passed in.
        steps = list(classifier.named_steps.values())

    for step in steps:
        if hasattr(step, "scores_"):  # SelectKBest
            return pd.Series(step.scores_, index=names)
        if hasattr(step, "ranking_"):  # RFE
            # Invert the rank (1 = best) into a score (higher = better).
            return pd.Series(step.ranking_.max() - step.ranking_, index=names)
        if hasattr(step, "feature_importances_"):  # RandomForest & co.
            return pd.Series(step.feature_importances_, index=names)
        if hasattr(step, "coef_"):  # SVM / linear models
            return pd.Series(abs(step.coef_).sum(axis=0), index=names)

feature_names = df.drop(columns="class").columns
feature_scores = feature_importances(pipeline, feature_names)

plt.figure(tight_layout=True, figsize=(8, 0.2 * len(feature_scores)))
pd.Series(feature_scores, index=feature_names).sort_values().plot(kind='barh')
plt.xlabel('Score (higher is better)')
plt.ylabel('Feature');

Feature selection

Recursive feature elimination (cross-validated)

First, the estimator is trained on the full set of features and the importance of each feature is obtained. The least important feature(s) are pruned from current set of features. That procedure is recursively repeated on the pruned set. Based on the cross-validation scores for every iteration, the optimal number and selection of features can be determined.

In [ ]:
from sklearn.preprocessing import minmax_scale
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

rfecv_step = 10
rfecv = RFECV(
    RandomForestClassifier(n_estimators=10, n_jobs=-1),
    step=rfecv_step, cv=5, n_jobs=-1,
)
%time rfecv.fit(minmax_scale(X), y)
print(f"Feature number of highest CV-score: {rfecv.n_features_}")
In [ ]:
# NOTE(review): RFECV.grid_scores_ was removed in scikit-learn 1.2;
# use rfecv.cv_results_["mean_test_score"] on newer versions.
plt.figure(figsize=(9.8, 3), tight_layout=True)
# Reconstruct the feature count evaluated at each RFE iteration:
# full width minus `rfecv_step` per iteration, clipped at 1, ascending.
n_features = np.clip(
    X.shape[1] - np.arange(len(rfecv.grid_scores_)) * rfecv_step, 1, None,
)[::-1]
plt.plot(n_features, rfecv.grid_scores_, "--o")
plt.xlabel("Number of selected features")
plt.ylabel("CV-score")
In [ ]:
# X_selected = rfecv.transform(X)
# X_selected.shape

Which features correlate with false predictions?

In [ ]:
dfp = df.drop(columns="class")
dfp["predict"] = (pipeline.predict(X) == y)
In [ ]:
fisher_ratios(dfp, "predict").sort_values(ascending=False).head(20)
In [ ]:
boxenplot(dfp, "predict", logscale=True)
In [ ]:
distplot(dfp, "predict")
In [ ]: